{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "x_train :\n",
      " [[33.  1.  0. ... 52.  0.  1.]\n",
      " [63.  1.  0. ... 52.  0.  1.]\n",
      " [71.  0.  0. ...  0.  0.  1.]\n",
      " ...\n",
      " [16.  0.  0. ...  8.  1.  0.]\n",
      " [48.  1.  0. ... 52.  0.  1.]\n",
      " [48.  0.  0. ...  0.  0.  1.]] (54256, 510) \n",
      "\n",
      "y_train :\n",
      " [1. 0. 0. ... 0. 0. 0.] (54256,) \n",
      "\n",
      "x_test :\n",
      " [[37.  1.  0. ... 52.  0.  1.]\n",
      " [48.  1.  0. ... 52.  0.  1.]\n",
      " [68.  0.  0. ...  0.  1.  0.]\n",
      " ...\n",
      " [38.  1.  0. ... 52.  0.  1.]\n",
      " [17.  0.  0. ... 40.  1.  0.]\n",
      " [22.  0.  0. ... 25.  1.  0.]] (27622, 510)\n"
     ]
    }
   ],
   "source": [
    "# Load the data (later cells normalize it and split it into a training set and a validation set)\n",
    "import numpy as np\n",
    "\n",
    "np.random.seed(0)\n",
    "\n",
    "x_train_fpath = '../data/X_train'\n",
    "y_train_fpath = '../data/Y_train'\n",
    "x_test_fpath  = '../data/X_test'\n",
    "\n",
    "# The first line of each file holds the feature names, so next(f) skips it.\n",
    "# The first field of every row is the id: the feature files keep every field\n",
    "# from index 1 onwards, while the label file keeps the field at index 1 only.\n",
    "with open(x_train_fpath) as f:\n",
    "    next(f)\n",
    "    x_train = np.array(\n",
    "        [fields[1:] for fields in (line.strip('\\n').split(',') for line in f)],\n",
    "        dtype = float,\n",
    "    )\n",
    "\n",
    "with open(y_train_fpath) as f:\n",
    "    next(f)\n",
    "    y_train = np.array(\n",
    "        [fields[1] for fields in (line.strip('\\n').split(',') for line in f)],\n",
    "        dtype = float,\n",
    "    )\n",
    "\n",
    "with open(x_test_fpath) as f:\n",
    "    next(f)\n",
    "    x_test = np.array(\n",
    "        [fields[1:] for fields in (line.strip('\\n').split(',') for line in f)],\n",
    "        dtype = float,\n",
    "    )\n",
    "\n",
    "# Show every loaded array together with its shape\n",
    "print('x_train :\\n', x_train, x_train.shape, '\\n')\n",
    "print('y_train :\\n', y_train, y_train.shape, '\\n')\n",
    "print('x_test :\\n', x_test, x_test.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _normalize(x, train = True, specified_column = None, x_mean = None, x_std = None):\n",
    "    '''\n",
    "    This function normalizes specific columns of x\n",
    "    注意，testing data要跟training data的normalize方式一致，要用training data的mean和std，\n",
    "    因此还需要input已知的x_mean和x_std\n",
    "    '''\n",
    "    # 如果没有指定列，那就穷举所有列，这里np.arange类似于range函数，只不过前者创造的对象是array类型\n",
    "    if specified_column == None:\n",
    "        specified_column = np.arange(x.shape[1])\n",
    "    \n",
    "    # train=True: for training data; train=False: for testing data，只计算training data的mean和std\n",
    "    if train:\n",
    "        # axis=0，对指定列求mean，注意np.mean返回的是一个列向量，因此需要用reshape(1,-1)转化成行向量\n",
    "        x_mean = np.mean(x[:, specified_column], axis = 0).reshape(1, -1)\n",
    "        # axis=0，对指定列求std\n",
    "        x_std  = np.std(x[:, specified_column], axis = 0).reshape(1, -1)\n",
    "     \n",
    "    # 对指定列进行normalize，注意相减的两个向量行数不同但列数相同，相当于前者的每一行都减去x_mean这个行向量，除法同理\n",
    "    # 分母加一个很小很小的数是为了避免标准差为0\n",
    "    x[:, specified_column] = (x[:, specified_column] - x_mean) / (x_std + 1e-8)\n",
    "    \n",
    "    return x, x_mean, x_std"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _train_split(x, y, validation_ratio = 0.25):\n",
    "    '''\n",
    "    This function splits data into training set and validation set\n",
    "    '''\n",
    "    train_size = int(len(x) * (1 - validation_ratio))\n",
    "    \n",
    "    #return x,y of training set and validation set  \n",
    "    # 如果返回值为x[:train_size, :]的话会报错，但这两种形式本质上是一样的，存疑\n",
    "    return x[:train_size], y[:train_size], x[train_size:], y[train_size:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "x_training_set :  (48830, 510) \n",
      " [[-0.42755297  0.99959458 -0.1822401  ...  0.80645987 -1.01485522\n",
      "   1.01485522]\n",
      " [ 1.19978056  0.99959458 -0.1822401  ...  0.80645987 -1.01485522\n",
      "   1.01485522]\n",
      " [ 1.63373617 -1.00040555 -0.1822401  ... -1.45536172 -1.01485522\n",
      "   1.01485522]\n",
      " ...\n",
      " [ 0.65733605 -1.00040555 -0.1822401  ...  0.80645987 -1.01485522\n",
      "   1.01485522]\n",
      " [ 0.27762489  0.99959458 -0.1822401  ...  0.28450104 -1.01485522\n",
      "   1.01485522]\n",
      " [ 0.16913599 -1.00040555 -0.1822401  ...  0.80645987  0.98536219\n",
      "  -0.98536219]]\n",
      "------------------------------------------------------------------------\n",
      "y_training_set :  (48830,) \n",
      " [1. 0. 0. ... 1. 0. 0.]\n",
      "------------------------------------------------------------------------\n",
      "x_validation_set :  (5426, 510) \n",
      " [[-0.48179742  0.99959458 -0.1822401  ...  0.80645987  0.98536219\n",
      "  -0.98536219]\n",
      " [-1.24121974 -1.00040555  5.48726602 ...  0.80645987 -1.01485522\n",
      "   1.01485522]\n",
      " [-0.04784181  0.99959458 -0.1822401  ...  0.80645987 -1.01485522\n",
      "   1.01485522]\n",
      " ...\n",
      " [-1.34970864 -1.00040555 -0.1822401  ... -1.10738916  0.98536219\n",
      "  -0.98536219]\n",
      " [ 0.3861138   0.99959458 -0.1822401  ...  0.80645987 -1.01485522\n",
      "   1.01485522]\n",
      " [ 0.3861138  -1.00040555 -0.1822401  ... -1.45536172 -1.01485522\n",
      "   1.01485522]]\n",
      "------------------------------------------------------------------------\n",
      "y_validation_set :  (5426,) \n",
      " [1. 0. 0. ... 0. 0. 0.]\n"
     ]
    }
   ],
   "source": [
    "# Standardize the training data, then apply the same mean/std to the testing data\n",
    "x_train, x_mean, x_std = _normalize(x_train, train = True)\n",
    "x_test, _, _ = _normalize(x_test, train = False, x_mean = x_mean, x_std = x_std)\n",
    "\n",
    "# Hold out the trailing 10% of the training data as the validation set\n",
    "x_training_set, y_training_set, x_validation_set, y_validation_set = _train_split(\n",
    "    x_train, y_train, validation_ratio = 0.1\n",
    ")\n",
    "\n",
    "# Print each split as: label, shape, newline, contents, with a dashed rule between splits\n",
    "splits = [\n",
    "    ('x_training_set : ', x_training_set),\n",
    "    ('y_training_set : ', y_training_set),\n",
    "    ('x_validation_set : ', x_validation_set),\n",
    "    ('y_validation_set : ', y_validation_set),\n",
    "]\n",
    "for position, (label, values) in enumerate(splits):\n",
    "    if position > 0:\n",
    "        print('------------------------------------------------------------------------')\n",
    "    print(label, values.shape, '\\n', values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "training_set  Acc： 0.8861560516076182\n",
      "validation_set Acc： 0.877994839660892\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/gehao/anaconda3/envs/ml/lib/python3.6/site-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
      "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
      "\n",
      "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
      "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
      "Please also refer to the documentation for alternative solver options:\n",
      "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
      "  extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n"
     ]
    }
   ],
   "source": [
    "# Logistic regression baseline\n",
    "# (classification_report and GridSearchCV are imported but not used in this cell)\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import classification_report\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "\n",
    "# With the default iteration limit lbfgs stopped before converging and emitted a\n",
    "# ConvergenceWarning, so give the solver a larger budget. The features are already\n",
    "# standardized, which is the other remedy that warning suggests.\n",
    "lg = LogisticRegression(C = 1.0, max_iter = 1000)\n",
    "lg.fit(x_training_set, y_training_set)\n",
    "\n",
    "# score() reports the mean accuracy of the model on the given data and labels\n",
    "y_training_predict = lg.predict(x_training_set)\n",
    "print('training_set  Acc：', lg.score(x_training_set, y_training_set))\n",
    "\n",
    "y_validation_predict = lg.predict(x_validation_set)\n",
    "print('validation_set Acc：', lg.score(x_validation_set, y_validation_set))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.6.8 64-bit ('ml': conda)",
   "language": "python",
   "name": "python36864bitmlconda4727b916de1c4a8f8265036fed6d2bb8"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
